1 Required libraries

The required libraries are loaded - RomicsProcessor written by Geremy Clair (2021) is used to perform trackable transformation and statistics to the dataset - proteinminion written by Geremy Clair (2021) is used to extract fasta information and to perform gene ontology and KEGG pathways enrichement analysis (2021)

library("RomicsProcessor")
library("proteinminion")
library("DT") #for the rendering of the enrichment tables 

2 Fasta and protein ontologies download using ‘Protein Mini-On’

Using the package ‘Protein Mini-on’ (Geremy Clair 2021, in prep.), The fasta file was downloaded from Unipro for the human and bovine proteome on the Jun 15th, 2020

if(!file.exists("./03_output_files/Uniprot_Homo_sapiens_proteome_UP000005640_2020_06_15.fasta")){
    download_UniProtFasta(proteomeID = "UP000005640",reviewed = F,export = TRUE, file="./03 - Output files/Uniprot_Homo_sapiens_proteome_UP000005640_2020_06_15.fasta")
}

UniProtFasta_info<-UniprotFastaParser(file = "./03_output_files/Uniprot_Homo_sapiens_proteome_UP000005640_2020_06_15.fasta")
## [1] "./03_output_files/Uniprot_Homo_sapiens_proteome_UP000005640_2020_06_15.fasta"
write.csv(UniProtFasta_info, "./03_output_files/UniProtFasta_info.csv")

For each entry, ‘Protein Mini-On’ was use to download Gene Ontology (GO) terms and KEGG ids associated with the proteins. This upload was performed the exact same day as the download of the fasta file was done to ensure that the IDs will be identical as the ones present in the fasta file used).

if(file.exists("./03_output_files/UniprotTable_Homo_sapiens_proteome_UP000005640_2020_06_15.csv")){
  UniProtTable<-read.csv("./03_output_files/UniprotTable_Homo_sapiens_proteome_UP000005640_2020_06_15.csv")
  }else{
  download_UniProtTable(proteomeID = "UP000005640",reviewed = F)
  write.csv(UniProtTable,("./03_output_files/UniprotTable_Homo_sapiens_proteome_UP000005640_2020_06_15.csv"),row.names=FALSE)
  }

‘Protein-Mini-on’ was then used to generate a table (UniProtTable) containing the list of GOs and their associated protein IDs

if(file.exists("./03_output_files/UniProtTable_GO.csv")){
  UniProtTable_GO<-read.csv(file="./03_output_files/UniProtTable_GO.csv")
}else{
generate_UniProtTable_GO()
write.csv(UniProtTable_GO,file="./03_output_files/UniProtTable_GO.csv",row.names=FALSE)
}

‘Protein-Mini-on’ was used to download similar information from KEGG for the Pathways associated with each protein

if(file.exists("./03_output_files/UniProtTable_KEGG.csv")){
  UniProtTable_KEGG<-read.csv(file="./03_output_files/UniProtTable_KEGG.csv")
}else{
generate_UniProtTable_KEGG()
write.csv(UniProtTable_KEGG,file="./03_output_files/UniProtTable_KEGG.csv",row.names=FALSE)
}

3 MaxQuant import

The LFQ data contained in the protein table was loaded, the corresponding metadata was loaded

data<-extractMaxQuant("./01_source_files/proteinGroups.txt",quantification_type = "LFQ",cont.rm = T,site.rm = T,rev.rm = T)
## [1] "282  Proteins were removed (protein(s) only identified by site,contaminant(s),reverse hit(s))"
## [1] "LFQ quantification was used"
IDsdetails<-extractMaxQuantIDs("./01_source_files/proteinGroups.txt",cont.rm = T,site.rm = T,rev.rm = T)
## [1] "282  Proteins were removed (protein(s) only identified by site,contaminant(s),reverse hit(s))"
IDsdetails<-cbind(UniProt_Name=sub(".*\\|","",IDsdetails$protein.ids), IDsdetails)
colnames(data)<- sub("LFQ.intensity.","p",colnames(data))
metadata<- read.csv(file = "./01_source_files/metadata.csv")
colnames(metadata)<-tolower(colnames(metadata))
colnames(metadata)<-sub("x", "",colnames(metadata))
write.csv(IDsdetails,"MaxQuantIDS.csv")

4 Romics_object creation

The data and metadata were placed in an romics_object, the sample names were retrieved from the metadata, the condition will be use for the coloring of the Figure and statistics

colnames(data)[-1]<-gsub("p","",colnames(data)[-1])
romics_proteins<- romicsCreateObject(data, metadata,main_factor = "Condition")

5 Full data analysis

5.1 Data cleaning and normalization

The missingness was evaluated for each sample

romics_proteins<- romicsZeroToMissing(romics_proteins)
romicsPlotMissing(romics_proteins)

This result suggest that the Mono cultures were either more concentrated in proteins or that the Co Culture were heavily contaminated with some serum or matrix proteins…

The proteins to be conserved for quantification were selected to contain at least 70% of complete values (3/4 samples) for a given condition, the overall missingness was evaluated after filtering.

romics_proteins<-romicsFilterMissing(romics_proteins, percentage_completeness = 70)
## [1] "1289 rows were removed for the data"
## [1] "Based on the minimum completeness set at 70%"
## [1] "at least the following number of sample(s) containing data was required:"
##    EC_Coculture  EC_Monoculture   EVT_Coculture EVT_monoculture 
##               3               3               3               3 
## [1] "2623/3912 proteins remained after filtering (67.05%)."
print(paste0(nrow(romics_proteins$data),"/", nrow(romics_proteins$original_data)," proteins remained after filtering", " (",round(nrow(romics_proteins$data)/nrow(romics_proteins$original_data)*100,2),"%)."))
## [1] "2623/3912 proteins remained after filtering (67.05%)."
romicsPlotMissing(romics_proteins)

As the same quantity of protein was labelled for each sample, the expectation is that the distribution of the protein abundance is centered, therefore a median centering was performed prior to plot again the distribution boxplots. However here due to the large missingness I am not certain that this is the best normalization strategy

romics_proteins<-log2transform(romics_proteins)
romics_proteins<-medianCenterSample(romics_proteins)
distribBoxplot(romics_proteins)

5.2 Grouping evaluation

The grouping of the samples by is checked by hierarchical clustering

romicsHclust(romics_proteins)

romicsHclust(romicsSubset(romics_proteins,subset_vector = c("EVT_monoculture","EVT_Coculture"),by = "level",type = "keep",factor = "Condition"))

romicsHclust(romicsSubset(romics_proteins,subset_vector = c("EVT_monoculture","EVT_Coculture"),by = "level",type = "drop",factor = "Condition"))

5.3 Data imputation

For some of the subsequent statistics imputations are required, we performed an imputation by assuming that the “non-detected” proteins were either low abundance or missing using the method developped by Tyranova et al. (PMID: 27348712). The gray distribution is the data distribution, the yellow distribution is the one for the random values used for imputation.

imputeMissingEval(romics_proteins,nb_stdev = 2,width_stdev = 0.5, bin=1)

romics_proteins<-imputeMissing(romics_proteins,nb_stdev = 2,width_stdev = 0.5)

The hclust and PCA grouping were checked again after imputation

PCA_proteins<-romicsPCA(romics_proteins)
indPCAplot(romics_proteins, plotType = "percentage")

indPCAplot(romics_proteins, plotType = "individual",Xcomp=1,Ycomp =2)

indPCAplot(romics_proteins,  plotType = "individual",Xcomp=1,Ycomp =3)

indPCA3D(romics_proteins)
romics_EVT<-romicsSubset(romics_proteins,subset_vector = c("EVT_monoculture","EVT_Coculture"),by = "level",type = "keep",factor = "Condition")
romics_EC<-romicsSubset(romics_proteins,subset_vector = c("EVT_monoculture","EVT_Coculture"),by = "level",type = "drop",factor = "Condition")

indPCAplot(romics_EVT, plotType = "individual")

PCA_EVT<-romicsPCA(romics_proteins)

indPCAplot(romics_EC, plotType = "individual")

PCA_EC<-romicsPCA(romics_proteins)

Most of the grouping is due to the Co-Culture vs mono-culture and probably derive from the important missingness in the co-cultures. On the second PC axis the cell group by type.

5.4 Statistics

the means and stdev were calculated for each group

romics_proteins<-romicsMean(romics_proteins)
## [1] "The Statistics layer was added to your object"
## [1] "Means columns (*_mean) were added to the statistics"
romics_proteins<-romicsSd(romics_proteins)
## [1] "The standard deviation columns (*_sd) were added to the statistics"

An ANOVA was performed

romics_proteins<-romicsANOVA(romics_proteins)
## [1] "The ANOVA columns (ANOVA_p and ANOVA_padj) were added to the statistics"
print(paste0(sum(romics_proteins$statistics$ANOVA_p<0.05), " proteins had an ANOVA p<0.05."))
## [1] "2213 proteins had an ANOVA p<0.05."
pval<-data.frame(ids=rownames(romics_proteins$statistics), p=romics_proteins$statistics$ANOVA_p)
ggplot(pval, aes(p)) + geom_histogram(binwidth = 0.01)+theme_ROP()+ggtitle("ANOVA p frequency plot")+geom_vline(xintercept=0.05,linetype="dashed", color = "red")

print(paste0(sum(romics_proteins$statistics$ANOVA_padj<0.05), " proteins had an ANOVA padjusted<0.05."))
## [1] "2184 proteins had an ANOVA padjusted<0.05."
pval<-data.frame(ids=rownames(romics_proteins$statistics), p=romics_proteins$statistics$ANOVA_padj)
ggplot(pval, aes(p)) + geom_histogram(binwidth = 0.01)+theme_ROP()+ggtitle("ANOVA padj frequency plot")+geom_vline(xintercept=0.05,linetype="dashed", color = "red")

A heatmap depicting the proteins passing an ANOVA p<0.05 is plotted, the clusters obtained were saved in the statistics.

romicsHeatmap(romics_proteins,variable_hclust_number = 2,ANOVA_filter = "padj", p=0.01)

romics_proteins<-romicsVariableHclust(romics_proteins,clusters = 2,ANOVA_filter = "padj",p= 0.01,plot = F)
## [1] "The columns hclust_clusters was added to the statistics"
romics_proteins<-romicsZscores(romics_proteins)
## [1] "Z_score_ columns were added to the statistics"

Most of the ANOVA results seem to derive from the large missingness in the co-culture vs mono-culture

Therefore Student’s T.test were performed between all the conditions tested

romics_proteins<-romicsTtest(romics_proteins)
## [1] "T_test columns were added to the statistics"

Volcano plots were plotted

romicsVolcano(romics_proteins,p_type = "p",p = 0.05,plot = "all",min_fold_change = 0.6)
## [1] "'stat_type' was missing 't.test' were used by default"

I believe that the most interesting proteins are those that are lower in the monoculture (therefore higher in the co-culture vs the monoculture for a given cell type)

Universe<-rownames(romics_proteins$statistics)
Universe<-gsub("\\;.*","",Universe)

EC_high_in_CO_vs_mono<- Universe[romics_proteins$statistics$EC_Monoculture_vs_EC_Coculture_Ttest_p<0.05 & romics_proteins$statistics$`log(EC_Monoculture/EC_Coculture)`<0]

EVT_high_in_CO_vs_mono<- Universe[romics_proteins$statistics$EVT_monoculture_vs_EVT_Coculture_Ttest_p<0.05 & romics_proteins$statistics$`log(EVT_monoculture/EVT_Coculture)`<0]

EC_high_in_CO_vs_mono_enrichment<-cbind(enrichement_for="EC_high_in_CO_vs_mono",UniProt_GO_Fisher(EC_high_in_CO_vs_mono,Universe))
## [1] "Your <query> contained 0 UniProt_IDs and 29 UniProt_Accession (some might be redundant)."
## [1] "The uniprot_Accession of your query were converted in Uniprot_IDs."
## [1] "Your universe contained 0 UniProt_IDs and 2623 UniProt_Accession (some might be redundant)."
## [1] "The uniprot_Accession of your universe were converted in Uniprot_IDs."
EVT_high_in_CO_vs_mono_enrichment<-cbind(enrichement_for="EVT_high_in_CO_vs_mono",UniProt_GO_Fisher(EVT_high_in_CO_vs_mono,Universe))
## [1] "Your <query> contained 0 UniProt_IDs and 35 UniProt_Accession (some might be redundant)."
## [1] "The uniprot_Accession of your query were converted in Uniprot_IDs."
## [1] "Your universe contained 0 UniProt_IDs and 2623 UniProt_Accession (some might be redundant)."
## [1] "The uniprot_Accession of your universe were converted in Uniprot_IDs."
Enrichment<-rbind(EC_high_in_CO_vs_mono_enrichment,EVT_high_in_CO_vs_mono_enrichment)
Enrichment<-Enrichment[Enrichment$pval<0.05&Enrichment$fold_change>0,]

datatable(Enrichment)

The data was exported for further exploration

results<-romicsExportData(romics_proteins,statistics = T,missing_data = T)
results<-merge(IDsdetails,results,by.x = "UniProt_Name", by.y=0)
representativeIDs<- data.frame(Uniprot_Accession=gsub("\\;.*","",results$majority.protein.ids))
representativeIDs<- merge(representativeIDs,UniProtTable,by="Uniprot_Accession")
results<-cbind(representativeIDs,results)
write.csv(results, "./03_output_files/Complete_results.csv")